
# Generate Appendix Figure S5

# Set up ####
library(pacman)
p_load(ggplot2, dplyr, magrittr, rdrobust, here, patchwork)

# Shared ggplot2 theme applied to the plots in this script: legend at the
# bottom, transparent panel with a light grey border, and very faint grid lines.
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for line
# elements; `size` is kept so the script also runs on older ggplot2 versions.
theme_ed <- theme(
  legend.position  = "bottom",
  panel.background = element_rect(fill = NA),
  panel.border     = element_rect(colour = "grey75", fill = NA),
  axis.ticks       = element_line(colour = "grey85"),
  panel.grid.major = element_line(colour = "grey95", size = 0.2),
  panel.grid.minor = element_line(colour = "grey95", size = 0.2),
  legend.key       = element_blank()
)

# Load and Clean Data ####
# Restore the saved objects from the .RData file into the calling environment;
# the pipeline below expects this to provide `tfa_dat_flag`.
load(here("Survey Only", "Temp_Data", "tfa_to_state_ALL_flags_v2.RData"))

### Derive birth-year and graduation-year variables
tfa_dat_flag <- tfa_dat_flag %>%
  # Year columns are converted to numeric by going through character first
  mutate_at(
    vars(DegYear, birthyear_app, dob_survey, BirthYear),
    function(col) as.numeric(as.character(col))
  ) %>%
  mutate(
    # Standardized name for the survey birth year
    birthyear_survey = dob_survey,
    # Missingness indicators (1 = missing, 0 = observed)
    isna_birth_app = as.numeric(is.na(birthyear_app)),
    isna_birth_survey = as.numeric(is.na(dob_survey)),
    isna_grad = as.numeric(is.na(DegYear)),
    isna_BirthYear = as.numeric(is.na(BirthYear)),
    # Gap between degree year and each known birth year
    diff_app = DegYear - birthyear_app,
    diff_survey = DegYear - dob_survey
  ) %>%
  # Readable labels; mat4_txt, z and est_birth are stored as factors
  mutate(
    mat4_txt = as.factor(
      ifelse(matriculated4 == 1, "Matriculant", "Non-matriculant")
    ),
    isna_birth_app_txt = ifelse(is.na(birthyear_app),
                                "100% Missing", "0% Missing"),
    isna_grad_txt = ifelse(is.na(DegYear), "100% Missing", "0% Missing"),
    isna_BirthYear_txt = ifelse(is.na(BirthYear),
                                "100% Missing", "0% Missing"),
    isna_birth_survey_txt = ifelse(is.na(dob_survey),
                                   "100% Missing", "0% Missing"),
    z = as.factor(ifelse(zscore >= 0, "Above cutoff", "Below cutoff")),
    est_birth = as.factor(
      ifelse(appyear %in% c(2007:2009, 2014:2015),
             "Estimated", "Not Estimated")
    )
  ) %>%
  # Three-category treatment variable; rows matching none of the conditions
  # (including those with missing inputs) are left as NA.
  # NOTE(review): "Praticipant" looks like a typo for "Participant"; the level
  # name is kept unchanged because other code may refer to it.
  mutate(
    treat = case_when(
      admitted == "N" ~ "Rejected",
      matriculated4 == 1 & admitted == "Y" ~ "Praticipant",
      matriculated4 == 0 & admitted == "Y" ~ "Admitted only"
    ),
    treat = factor(treat,
                   levels = c("Praticipant", "Admitted only", "Rejected"))
  ) %>%
  # Absolute distance between each birth year and (DegYear - 22), plus 0/1
  # indicators for a distance of exactly 0, exactly 1, and in (1, 3]
  mutate(
    est_diff_survey = abs(birthyear_survey - (DegYear - 22)),
    est_diff_survey_0 = as.numeric(est_diff_survey == 0),
    est_diff_survey_1only = as.numeric(est_diff_survey == 1),
    est_diff_survey_2to3 = as.numeric(est_diff_survey > 1 &
                                        est_diff_survey <= 3),
    est_diff_app = abs(birthyear_app - (DegYear - 22)),
    est_diff_app_0 = as.numeric(est_diff_app == 0),
    est_diff_app_1only = as.numeric(est_diff_app == 1),
    est_diff_app_2to3 = as.numeric(est_diff_app > 1 & est_diff_app <= 3)
  )


# Create plots ####

# Plot binned means of a birth-year discrepancy indicator against zscore.
#
# Arguments:
#   dat   Data frame containing zscore, z and the est_diff_<type>_<diff>
#         indicator columns (plus state_st when type = "survey"). Defaults to
#         the script-level `tfa_dat_flag`.
#   type  Which birth-year source to use: "survey" (default) or "app".
#   diff  Which indicator to plot: "0" (default), "1only" or "2to3".
#
# Returns a ggplot object: 30-bin means of the chosen 0/1 indicator (points),
# a loess smooth fitted separately on each side of the cutoff (grouped by `z`),
# and a dashed vertical line at zscore = 0. The x axis is limited to [-1, 1]
# and the script-level `theme_ed` is applied.
rdplot_est_diff <- function(dat = tfa_dat_flag,
                            type = c("survey", "app"),
                            diff = c("0", "1only", "2to3")) {
  # Resolve the enumerated arguments to a single valid value. Without this,
  # calling with the defaults passed length > 1 vectors to `if ()`.
  # as.character() keeps numeric input such as diff = 0 working.
  type <- match.arg(type)
  diff <- match.arg(as.character(diff), c("0", "1only", "2to3"))

  if (type == "survey") {
    # Survey panels only use rows with a non-missing, non-empty state_st
    dat <- dat %>% filter(!is.na(state_st) & state_st != "")
  }
  # For type == "app" no subsetting is applied; a previous filter is disabled:
  # dat %<>% filter(appyear %in% c(2007:2009,2014:2015))

  # Name of the 0/1 indicator column to plot on the y axis
  yvar <- paste0("est_diff_", type, "_", diff)

  # Panel title
  plot_title <- switch(diff,
    "0" = "0 years difference",
    "1only" = "1 years difference",
    "2to3" = "2-3 years difference"
  )

  dat %>%
    ggplot(aes_string(x = "zscore", y = yvar, group = "z")) +
    # `fun.x` is not an argument of stat_summary_bin(); it was ignored with a
    # warning and the stat fell back to mean_se(). Passing mean_se via
    # `fun.data` draws the same bin means without the warning.
    stat_summary_bin(fun.data = mean_se, bins = 30, color = "black",
                     fill = "black", alpha = .5, size = 2, geom = "point",
                     shape = 21) +
    geom_smooth(method = "loess", se = FALSE, color = "steelblue", span = 10) +
    # A single reference line at the cutoff (instead of one line per data row)
    geom_vline(xintercept = 0, linetype = "dashed", color = "red3") +
    ylab(NULL) + ggtitle(plot_title) +
    xlim(-1, 1) + theme_ed
}
# Generate combined plot
# The three survey panels (0, 1 and 2-3 years difference) are combined into one
# figure. Adding ggplot objects with `+` and plot_annotation() both come from
# the patchwork package, which therefore has to be attached.
(rdplot_est_diff(type = "survey", diff = "0") +
  rdplot_est_diff(type = "survey", diff = "1only") +
  rdplot_est_diff(type = "survey", diff = "2to3")) +
  plot_annotation(
    title = "Proportion Obs with Birthyear (survey) within 0-3 years of (Gradyear-22)",
    subtitle = ""
  )